Titanic tragedy data

Reading RAW training data

# Load the raw training data; the path is relative to the working directory.
titanic <- read.csv(file = "Titanic_train.csv")
## Warning in file(file, "rt"): cannot open file 'Titanic_train.csv': No such
## file or directory
## Error in file(file, "rt"): cannot open the connection

Look at the first few rows

What would be some good features to consider here?

# Widen printed output to 110 characters so wide rows wrap less.
options(width = 110)
# Preview the first rows of the data frame.
head(titanic)
## Error in head(titanic): object 'titanic' not found

What is the data type of each column?

# Report the class of every column of the data frame.
sapply(titanic,class)
## Error in lapply(X = X, FUN = FUN, ...): object 'titanic' not found

Converting class label to a factor

# Recode the outcome as a labelled factor. factor() assigns labels
# positionally to the sorted distinct values, so "died" goes to the
# lowest-sorting value in the column.
titanic$Survived <- factor(titanic$Survived, labels=c("died", "survived"))
## Error in factor(titanic$Survived, labels = c("died", "survived")): object 'titanic' not found
# Same positional mapping for the embarkation port: the four labels follow
# the sorted raw codes. The first label is presumably for blank entries --
# TODO confirm against the CSV. (Label spelling fixed: "unkown" -> "unknown".)
titanic$Embarked <- factor(titanic$Embarked, labels=c("unknown", "Cherbourg", "Queenstown", "Southampton"))
## Error in factor(titanic$Embarked, labels = c("unkown", "Cherbourg", "Queenstown", : object 'titanic' not found
# Re-check the column classes after the conversion.
sapply(titanic,class)
## Error in lapply(X = X, FUN = FUN, ...): object 'titanic' not found
# Inspect the structure of the outcome and of the Sex column.
str(titanic$Survived)
## Error in str(titanic$Survived): object 'titanic' not found
str(titanic$Sex)
## Error in str(titanic$Sex): object 'titanic' not found

Class distribution - PIE Charts

# Count the passengers in each level of Survived.
survivedTable = table(titanic$Survived)
## Error in table(titanic$Survived): object 'titanic' not found
survivedTable
## Error in eval(expr, envir, enclos): object 'survivedTable' not found
# Set all inner (mar) and outer (oma) margins to zero so the pie fills the device.
par(mar = c(0, 0, 0, 0), oma = c(0, 0, 0, 0))
# The labels argument replaces the table's own names on the slices.
pie(survivedTable,labels=c("Died","Survived"))
## Error in pie(survivedTable, labels = c("Died", "Survived")): object 'survivedTable' not found

Is Sex a good predictor?

# Split the passengers into two data frames by the value of Sex.
male = titanic[titanic$Sex=="male",]
## Error in eval(expr, envir, enclos): object 'titanic' not found
female = titanic[titanic$Sex=="female",]
## Error in eval(expr, envir, enclos): object 'titanic' not found
# One row, two columns of plots; a top margin of 2 lines leaves room for titles.
par(mfrow = c(1, 2), mar = c(0, 0, 2, 0), oma = c(0, 1, 0, 1))
# Survival counts among men, then among women, drawn side by side.
pie(table(male$Survived),labels=c("Dead","Survived"),  main="Survival Portion Among Men")
## Error in table(male$Survived): object 'male' not found
pie(table(female$Survived),labels=c("Dead","Survived"), main="Survival Portion Among Women")
## Error in table(female$Survived): object 'female' not found

Is Age a good predictor?

# Keep the age column in its own vector; the histogram and density chunks
# further down reuse `Age`.
Age <- titanic$Age
summary(Age)
## Error in eval(expr, envir, enclos): object 'titanic' not found
## Error in summary(Age): object 'Age' not found

How about a summary segmented by survival?

# Age summary for the rows whose Survived level is "died".
summary(titanic[titanic$Survived=="died",]$Age)
## Error in summary(titanic[titanic$Survived == "died", ]$Age): object 'titanic' not found
# Age summary for the rows whose Survived level is "survived".
summary(titanic[titanic$Survived=="survived",]$Age)
## Error in summary(titanic[titanic$Survived == "survived", ]$Age): object 'titanic' not found

Age distribution by Survival and Sex

# Two panels side by side, with margins large enough for axes and titles.
par(mfrow = c(1, 2), mar = c(4, 4, 2, 2), oma = c(1, 1, 1, 1))
# Age grouped by Sex: one box per level.
boxplot(titanic$Age~titanic$Sex, main="Age Distribution By Gender",col=c("red","green"))
## Error in eval(expr, envir, enclos): object 'titanic' not found
# Age grouped by Survived.
# NOTE(review): the x-axis label refers to 0/1 codes, but Survived is
# relabelled to "died"/"survived" earlier in this document -- confirm the
# label is still what you want.
boxplot(titanic$Age~titanic$Survived, main="Age Distribution By Survival",col=c("red","green"),
        xlab="0:Died 1:Survived",ylab="Age")
## Error in eval(expr, envir, enclos): object 'titanic' not found

Histogram of Age

# Histogram of the Age vector with explicit axis labels and title.
hist(Age, col="blue", xlab="Age", ylab="Frequency",
     main = "Distribution of Passenger Ages on Titanic")
## Error in hist(Age, col = "blue", xlab = "Age", ylab = "Frequency", main = "Distribution of Passenger Ages on Titanic"): object 'Age' not found

Kernel density plot of age

# Kernel density estimate of age, computed on the non-missing values only.
d = density(na.omit(Age)) # density(Age) won't work, need to omit all NAs
## Error in na.omit(Age): object 'Age' not found
# Draw the density curve first ...
plot(d, main = "kernel density of Ages of Titanic Passengers")
## Error in plot(d, main = "kernel density of Ages of Titanic Passengers"): object 'd' not found
# ... then fill the area under it on the same plot.
polygon(d, col="red", border="blue")
## Error in xy.coords(x, y): object 'd' not found

Comparison of density plots of Age with different Sex

## Error in na.omit(titanic): object 'titanic' not found
## Package 'sm', version 2.2-5.4: type help(sm) for summary information
## Error in is.vector(x): object 'titanic_na_removed' not found
## Error in title(main = "Kernel Density Plot of Ages By Sex"): plot.new has not been called yet
## Error in levels(titanic_na_removed$Sex): object 'titanic_na_removed' not found
## Error in levels(titanic_na_removed$Sex): object 'titanic_na_removed' not found

Did Age have an impact on survival?

## Error in plot(d, main = "kernel density of Ages of Titanic Passengers", : object 'd' not found
## Error in xy.coords(x, y): object 'd' not found
## Error in is.vector(x): object 'titanic_na_removed' not found
## Error in title(main = "Kernel Density Plot of Ages By Sex", cex.main = 3): plot.new has not been called yet
## Error in levels(titanic_na_removed$Sex): object 'titanic_na_removed' not found
## Error in levels(titanic_na_removed$Sex): object 'titanic_na_removed' not found
## Error in is.vector(x): object 'titanic_na_removed' not found
## Error in title(main = "Kernel Density Plot of Ages By Survival", cex.main = 3): plot.new has not been called yet
## Error in levels(titanic_na_removed$Survived): object 'titanic_na_removed' not found
## Error in levels(titanic_na_removed$Survived): object 'titanic_na_removed' not found

Create categorical groupings: Adult vs Child

An example of feature engineering!

## Multi dimensional comparison: derive an Adult/Child label from Age
Child <- titanic$Age # Isolating age.
## Error in eval(expr, envir, enclos): object 'titanic' not found
## Now we need to create categories: NA = Unknown, 1 = Child, 2 = Adult
## Every age below 13 (exclusive) is classified into age group 1
Child[Child<13] <- 1
## Error in Child[Child < 13] <- 1: object 'Child' not found
## Every age of 13 or above is classified into age group 2. This has to run
## after the previous step: values already replaced by 1 are below 13, so
## they are not touched here.
Child[Child>=13] <- 2
## Error in Child[Child >= 13] <- 2: object 'Child' not found
# Use labels instead of the numeric codes 1 and 2. The first assignment
# turns Child into a character vector, so the second comparison matches
# the code 2 as text.
Child[Child==1] <- "Child"
## Error in Child[Child == 1] <- "Child": object 'Child' not found
Child[Child==2] <- "Adult"
## Error in Child[Child == 2] <- "Adult": object 'Child' not found
# Appends the new column to a copy of the titanic dataset
titanic_with_child_column <- cbind(titanic, Child)
## Error in cbind(titanic, Child): object 'titanic' not found
# Removes rows where Child is NA, i.e. rows where the age was missing
titanic_with_child_column <- titanic_with_child_column[!is.na(titanic_with_child_column$Child),]
## Error in eval(expr, envir, enclos): object 'titanic_with_child_column' not found

Fare matters?

## Error in ggplot(titanic_with_child_column, aes(y = Fare, x = Survived)): object 'titanic_with_child_column' not found

How about fare, ship class, and port of embarkation?

## Error in is.factor(x): object 'titanic' not found
## Error in ggplot(titanic, aes(y = Fare, x = Pclass)): object 'titanic' not found

Diamond data

Overview of the diamond data

# Load the diamonds data set directly from the ggplot2 package, so this
# chunk works even when ggplot2 has not been attached yet (library(ggplot2)
# only appears in a later chunk of this document).
data(diamonds, package = "ggplot2")
head(diamonds, 16) # first 16 rows of the diamonds data set
##    carat       cut color clarity depth table price    x    y    z
## 1   0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2   0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3   0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4   0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5   0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 6   0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48
## 7   0.24 Very Good     I    VVS1  62.3    57   336 3.95 3.98 2.47
## 8   0.26 Very Good     H     SI1  61.9    55   337 4.07 4.11 2.53
## 9   0.22      Fair     E     VS2  65.1    61   337 3.87 3.78 2.49
## 10  0.23 Very Good     H     VS1  59.4    61   338 4.00 4.05 2.39
## 11  0.30      Good     J     SI1  64.0    55   339 4.25 4.28 2.73
## 12  0.23     Ideal     J     VS1  62.8    56   340 3.93 3.90 2.46
## 13  0.22   Premium     F     SI1  60.4    61   342 3.88 3.84 2.33
## 14  0.31     Ideal     J     SI2  62.2    54   344 4.35 4.37 2.71
## 15  0.20   Premium     E     SI2  60.2    62   345 3.79 3.75 2.27
## 16  0.32   Premium     E      I1  60.9    58   345 4.38 4.42 2.68

Histogram of carat

library(ggplot2)
# Histogram of carat; the bin width is left at the ggplot2 default.
ggplot(diamonds) +
  geom_histogram(mapping = aes(x = carat))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-20

Density plot of carat

# Density curve of carat, filled in mid-gray.
ggplot(diamonds) +
  geom_density(mapping = aes(x = carat), fill = "gray50")

plot of chunk unnamed-chunk-21

Scatter plots (carat vs. price)

# Scatter plot: one point per diamond, carat on x and price on y.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point()

plot of chunk unnamed-chunk-22

Carat with colors

# Store the base layer (data plus x/y mapping) so it can be reused.
g <- ggplot(data = diamonds, mapping = aes(x = carat, y = price))
# Add the points on top of the base layer, coloured by the `color` column.
g + geom_point(mapping = aes(color = color))

plot of chunk unnamed-chunk-23

Carat with colors (more details)

# Same coloured scatter plot, split into one panel per value of `color`.
g +
  geom_point(mapping = aes(color = color)) +
  facet_wrap(facets = ~color)

plot of chunk unnamed-chunk-24

Let's consider cut and clarity

plot of chunk unnamed-chunk-25

Your turn!

What have you learned about diamond prices after exploring this data?